{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import random"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pyspark"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 生成一个1000个元素组成的随机字符数组"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "ls = [chr(random.randint(65,71)) \n",
    "     for i in range(1000)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "rdd = pyspark.SparkContext().parallelize(ls)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 构建一个map，key是随机字符，value是随机整数\n",
    "## 进行缓存"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "rdd2 = rdd.map(lambda x :(x,random.randint(0,10000))).cache()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[('B', 6769), ('F', 8230), ('E', 3325), ('B', 148), ('F', 1956), ('D', 4230), ('A', 5862), ('D', 8148), ('A', 9723), ('D', 9276), ('E', 5407), ('G', 5177), ('F', 7832), ('B', 157), ('D', 5441), ('D', 2184), ('E', 3121), ('E', 4381), ('F', 7427), ('B', 4172), ('G', 4421), ('A', 7796), ('E', 1898), ('A', 2690), ('B', 5578), ('F', 2669), ('E', 2744), ('E', 5593), ('F', 2173), ('C', 5830)]\n"
     ]
    }
   ],
   "source": [
    "print(rdd2.take(30))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 进行groupByKey操作"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "grdd = rdd2.groupByKey()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## groupByKey会生成一个（key,[value迭代器]）组成的结构"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('B', <pyspark.resultiterable.ResultIterable at 0x1acef9c5160>)]"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "grdd.take(1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 再来一次Map，这次执行获取最大值和最小值的操作"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "def mymap(gmap):\n",
    "    return (gmap[0],(min(gmap[1]),max(gmap[1])))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('B', (104, 9917)),\n",
       " ('C', (63, 9979)),\n",
       " ('F', (56, 9958)),\n",
       " ('A', (10, 9790)),\n",
       " ('E', (168, 9929)),\n",
       " ('D', (30, 9925)),\n",
       " ('G', (95, 9954))]"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "grdd.map(lambda g: mymap(g)).collect()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 传统方式一、计算max和min，需要进行两次计算"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "maxval = rdd2.reduceByKey(lambda x,y:max(x,y)).collect()\n",
    "minval = rdd2.reduceByKey(lambda x,y:min(x,y)).collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[('B', 9917), ('C', 9979), ('F', 9958), ('A', 9790), ('E', 9929), ('D', 9925), ('G', 9954)]\n",
      "[('B', 104), ('C', 63), ('F', 56), ('A', 10), ('E', 168), ('D', 30), ('G', 95)]\n"
     ]
    }
   ],
   "source": [
    "print(maxval)\n",
    "print(minval)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 设计好map结构，也可以一次性算完"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('B', (9917, 104)),\n",
       " ('C', (9979, 63)),\n",
       " ('F', (9958, 56)),\n",
       " ('A', (9790, 10)),\n",
       " ('E', (9929, 168)),\n",
       " ('D', (9925, 30)),\n",
       " ('G', (9954, 95))]"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "rdd2.map(lambda x : (x[0],(x[1],x[1])))\\\n",
    ".reduceByKey(lambda x,y:(max(x[0],y[0]),min(x[1],y[1]))).collect()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
